
* ==============================================================================
* Ozone NAPS air quality data cleaning. 
* This file imports raw (excel file) NAPS data into stata, drops variables, and 
* merges into one longitudinal dataset. 

* The original raw data is reported by monitor. This file aggregates across monitors
* within a given CMA/CA using the aggregation directions provided by the Canada 
* Wide Standards. The final dataset produces a variable "O3_CMA", that gives the 
* CMA/CA's Ozone concentration needed to determine compliance with the CWS. If a
* CMA/CA-year has a O3_CMA higher than 65 they are not in compliance with the CWS.  

* Note that the folder structure in the raw data folder is preserved in the stata 
* data folder. 

* Note that if using the data files provided with the REStat replication 
* ("StationList_withCMA_SecondJoin.dta" and "O3_NAPS_Data.dta") please start
* from Step 4. 

* Author: Nouri Najjar
* Created: August 2015
* Last Updated: November 2019
* ==============================================================================

* ------------------------------------------------------------
* Preamble: Define data directories. Must be adjusted accordingly.
* ------------------------------------------------------------
* Raw data files
*local datadir_Raw D:\Google Drive\NAPS Data - Public\Ozone\Raw Data
*local datadir_Stata D:\Google Drive\NAPS Data - Public\Ozone\Stata Data\
local datadir_Main D:\REStat Data Archive\AQ Data\
local datadir_GIS D:\REStat Data Archive\AQ Data\

* ------------------------------------------------------------

/*
* ------------------------------------------------------------
* Step 1: Import each excel file into stata. Computes the 4th highest daily maximum 
* ozone reading for each station. Drops duplicate NAPSID_Year observations and 
* variables. 
* ------------------------------------------------------------

local files : dir "`datadir_Raw'" files "*.xlsx"

foreach file_name in `files' {
	cd "`datadir_Raw'"
	* Import file
	import excel using `file_name', allstring
	* Create NAPSID indicator
	gen NAPSID = real(substr(A,2,6))
	* Year
	gen Year = real(substr(A,8,4))
	* Month
	gen Month = real(substr(A,12,2))
	* Day
	gen Day = real(substr(A,14,2))
	* Year-Month-Day
	gen Year_Month_Day = substr(A,8,8)
	* Hourly air quality data
	gen H1 = real(B)
	gen H2 = real(C) 
	gen H3 = real(D)
	gen H4 = real(E)
	gen H5 = real(F)
	gen H6 = real(G)
	gen H7 = real(H)
	gen H8 = real(I)
	gen H9 = real(J)
	gen H10 = real(K)
	gen H11 = real(L)
	gen H12 = real(M)
	gen H13 = real(N)
	gen H14 = real(O)
	gen H15 = real(P)
	gen H16 = real(Q)
	gen H17 = real(R)
	gen H18 = real(S)
	gen H19 = real(T)
	gen H20 = real(U)
	gen H21 = real(V)
	gen H22 = real(W)
	gen H23 = real(X)
	gen H24 = real(Y)
	gen H25 = real(Z)
	gen H26 = real(AA)
	gen H27 = real(AB)
	* Drop old variables
	drop A B C D E F G H I J K L M N O P Q R S T U V W X Y Z AA AB
	* reshape data
	reshape long H, i(NAPSID Year_Month_Day) j(hourly)
	* Convert 999 to missing
	recode H (999 = .)
	* H for computing averages
	gen H_store = H
	recode H_store (.=0)
	* 8-Hour running Aves
	bysort NAPSID (Year_Month_Day hourly): gen Running8HrAve = (H_store[_n] + H_store[_n-1] + H_store[_n-2] + H_store[_n-3] + H_store[_n-4] + H_store[_n-5] + H_store[_n-6] + H_store[_n-7])/(cond(missing(H[_n])==0,1,0) + cond(missing(H[_n-1])==0,1,0) + cond(missing(H[_n-2])==0,1,0) + cond(missing(H[_n-3])==0,1,0) + cond(missing(H[_n-4])==0,1,0) + cond(missing(H[_n-5])==0,1,0) + cond(missing(H[_n-6])==0,1,0) + cond(missing(H[_n-7])==0,1,0))
	* Daily maximum 8-hour ave for each site-day
	bysort NAPSID Year_Month_Day (hourly): egen DMax_8Hr = max(Running8HrAve)
	* Fourth highest daily maximum for each site
	gen Dmax_sort = -DMax_8Hr
	bysort NAPSID (Dmax_sort): gen FourthHighest = DMax_8Hr[4]
	* Find duplicate station years and drop
	bysort NAPSID Year: gen dupStationYear = cond(_N==1,0,_n)
	drop if dupStationYear>1
	* Drop unneeded variables
	drop Year_Month_Day hourly Month Day H H_store Running8HrAve DMax_8Hr Dmax_sort dupStationYear
	* Save file
	cd "`datadir_Stata'"
	save "`file_name'.dta", replace
	clear
}

* ------------------------------------------------------------
* Step 2: Create master longitudinal dataset
* ------------------------------------------------------------
* Initiate longitudinal dataset
use "`datadir_Stata'1998o3.xlsx.dta"
save "`datadir_Main'O3_NAPS_Data.dta", replace

* Open other datasets and sort
local files : dir "`datadir_Stata'" files "*.dta"
cd "`datadir_Stata'"

* Exclude base file
local exclude "1998o3.xlsx.dta"
local usefiles: list files - exclude

foreach file_name in `usefiles' {
	* open file
	use `file_name'
	* sort data
	sort NAPSID Year
	* Save file
	save `file_name', replace
	clear
}

* Open main dataset and sort on merging variables (NAPSID)
use "`datadir_Main'O3_NAPS_Data.dta"
sort NAPSID Year
save "`datadir_Main'O3_NAPS_Data.dta", replace

* Merge data
local files : dir "`datadir_Stata'" files "*.dta"
cd "`datadir_Stata'"

* Exclude base file
local exclude "1998o3.xlsx.dta"
local usefiles: list files - exclude

foreach file_name in `usefiles' {
	* merge files and drop merge variable
	merge 1:m NAPSID Year using `file_name'
	drop _merge
}

save "`datadir_Main'O3_NAPS_Data.dta", replace
* ------------------------------------------------------------

* ------------------------------------------------------------
* Step 3: Rename ozone variable and compute 3-year average for
* each station from 2000 on.
* ------------------------------------------------------------
* Open data
use "`datadir_Main'O3_NAPS_Data.dta", clear

rename FourthHighest O3

* Declare panel
tsset NAPSID Year
* Fill panel
tsfill, full

* add zeros to missing values for 3-year ave computation
gen O3_Store = O3 if O3!=0
recode O3_Store (.=0)

* Count number of missing years in previous three years
bysort NAPSID (Year): gen MissCount = missing(O3[_n]) + missing(O3[_n-1]) + missing(O3[_n-2])

* 3 year ave using all three years for sites with enough data to do so
bysort NAPSID (Year): gen O3_3YearAve = (O3_Store[_n] + O3_Store[_n-1] + O3_Store[_n-2])/3 if MissCount==0
* 3 year ave using two years for sites with enough data to do so
bysort NAPSID (Year): replace O3_3YearAve = (O3_Store[_n] + O3_Store[_n-1] + O3_Store[_n-2])/2 if MissCount==1
* 3 year ave using 1 year for sites with enough data to do so
bysort NAPSID (Year): replace O3_3YearAve = (O3_Store[_n] + O3_Store[_n-1] + O3_Store[_n-2]) if MissCount==2

* Save 
save "`datadir_Main'O3_NAPS_Data.dta", replace
* ------------------------------------------------------------
*/
* ------------------------------------------------------------
* Step 4: merge in CMA/CA
* ------------------------------------------------------------
/* Not required.
* Open station list with CMA
use "`datadir_Main'O3StationList_WithCMA.dta", clear
* Sort on NAPSID
sort NAPSID
* Save file
save "`datadir_Main'O3StationList_WithCMA.dta", replace
*/

* Open master file
use "`datadir_Main'O3_NAPS_Data.dta", clear
* Sort on NAPSID
sort NAPSID
* Merge in CMA data
merge m:1 NAPSID using "`datadir_GIS'StationList_withCMA_SecondJoin.dta"
drop if _merge==2
drop _merge
* Save
save "`datadir_Main'O3_NAPS_Data.dta", replace
* ------------------------------------------------------------

* ------------------------------------------------------------
* Step 5: produce geographic max by CMA and save data at CMA level
* ------------------------------------------------------------
* Compute max for CMA-year
bysort CMAUID Year: egen O3_CMA = max(O3)
* Compute duplicate indicator for CMA-year
bysort CMAUID Year: gen DupCMAYear = cond(_N==1,0,_n)

* Drop duplicate CMA Year
drop if DupCMAYear>1
* Drop missing CMAs
drop if missing(CMAUID)==1
* Drop variables
drop NAPSID NAPSID O3 O3_Store MissCount O3_3YearAve DupCMAYear

* Save
save "`datadir_Main'O3_NAPS_Data_WithCMA.dta", replace
* ------------------------------------------------------------


* ------------------------------------------------------------
* Step 6: Create 3 year mean for CMA
* ------------------------------------------------------------
* add zeros to missing values for 3-year ave computation
gen O3_CMA_Store = O3_CMA if O3_CMA!=0
recode O3_CMA_Store (.=0)

* Count number of missing years in previous three years
bysort CMAUID (Year): gen MissCount_CMA = missing(O3_CMA[_n]) + missing(O3_CMA[_n-1]) + missing(O3_CMA[_n-2])

* 3 year ave using all three years for sites with enough data to do so
bysort CMAUID (Year): gen O3_CMA_3YearAve = (O3_CMA_Store[_n] + O3_CMA_Store[_n-1] + O3_CMA_Store[_n-2])/3 if MissCount_CMA==0
* 3 year ave using two years for sites with enough data to do so
bysort CMAUID (Year): replace O3_CMA_3YearAve = (O3_CMA_Store[_n] + O3_CMA_Store[_n-1] + O3_CMA_Store[_n-2])/2 if MissCount_CMA==1
* 3 year ave using 1 year for sites with enough data to do so
bysort CMAUID (Year): replace O3_CMA_3YearAve = (O3_CMA_Store[_n] + O3_CMA_Store[_n-1] + O3_CMA_Store[_n-2]) if MissCount_CMA==2

* Keep variables
keep Year CMAUID CMANAME O3_CMA O3_CMA_3YearAve

* Save
save "`datadir_Main'O3_NAPS_Data_WithCMA.dta", replace
* ------------------------------------------------------------
